home *** CD-ROM | disk | FTP | other *** search
Wrap
#include <stdio.h> #include <ctype.h> #include "util.h" #include "combine.h" /* * pass1: Perform the file read and table build pass * * This routine reads all of the records of all of the input files * into the cache, symbol table, and record array. * * This routine alternately reads one record from each file. Assuming * that the files have similar contents, this technique improves the * chance that a record from the second and third files will be in * the cache. * * Return value: * This procedure has no return value. */ void pass1 () { int i; /* Misc. variable */ int index; /* Index into record array */ int files_left; /* number of files left to do */ bool file_done[MAX_FILE_COUNT];/* TRUE if EOT reached on file */ int status; /* Misc. status variable */ /* * Initialize completion parameters. */ files_left = file_count; for (i = 0; i < file_count; ++i) { file_done[i] = FALSE; } /* * For each iteration of the file read the nth record of the each file. */ for (index = BEGIN_INDEX + 1; files_left != 0; ++index) { /* * Handle each file. */ for (i = 0; i < file_count; ++i) { if (!file_done[i]) { if (index + 1 >= files[i].record_array_alloc) { files[i].record_array_alloc += RA_INCR; files[i].record = (record_type *) realloc(files[i].record, files[i].record_array_alloc * sizeof (record_type)); if ( files[i].record == 0 ){ error( "out of memory -- files are way too big"); } } status = pass1_read_record (&files[i], index); if (status < 0) { file_done[i] = TRUE; files[i].record_array_size = index + 1; --files_left; if (files_left == 0) { break; } } } } } } /* * pass1_read_record: read a record for pass 1 * * This routine reads a record into the cache, symbol table, and record * array. * For each record read, a unique hash value is computed. * That hash value is an index into the symbol table. * * Return value: * 0: record read as requested. * -1: end of the file was reached. */ int pass1_read_record (file_ptr, index) file_type * file_ptr; /* input -- Description of the file to read record from. */ int index; /* input -- Record number of the record currently being read. */ { static cache_entry_type * cache_ptr = 0; /* Pointer to the cache entry which is used to read into. This cache buffer is linked onto the cache list if this is the first time the record is read. */ register int *end_int_ptr; /* Pointer to last integer in hashing algorithm */ cache_entry_type * found_cache_ptr; /* Pointer to the cache entry found by hashing */ register int *found_int_ptr;/* co-erced pointer to found record */ int hash_code; /* index into symbol table */ int i; /* Misc. variable */ register int *int_ptr;/* Co-erced pointer to read record */ char mbuffer[LINE_LENGTH];/* Misc. buffer */ int probe_count; /* Number a rehashes attempted. */ int quotient; /* quotient of sum/table size. This value is used as the increment when re-hashing into the symbol table */ cache_entry_type * reread_cache_ptr; /* Pointer to the cache buffer used for cache re-read operation. */ file_type * reread_file_ptr; /* Pointer to the file to read from on a cache re-read operation. */ int reread_index; /* Record number to read on a cache re-read operation. */ rfa_type rfa; /* RFA of record read */ int status; /* Misc. status variable */ register int sum; /* Sum of all bytes in a record */ /* * If we don't have a cache buffer laying around, * delink one from end of list. */ if (cache_ptr == 0) { deq_tail_dll (cache_head_ptr, cache_tail_ptr, cache_ptr, cache_next_ptr, cache_prev_ptr); if (cache_ptr -> hash_code != HASH_FREE_ENTRY) { sym_tab_cache_ptr[cache_ptr -> hash_code] = (cache_entry_type *) CACHE_NOT_IN_CACHE; } } /* * Read the next record into our local cache entry. */ rfa = ftell(file_ptr->seq_fd); if (p1_debug) { printf ("index: %d hash_col: %d cache_miss: %d", index, hash_collisions, cache_miss); } status = read_into_cache( file_ptr -> seq_fd, rfa, cache_ptr ); if (status < 0) { return (-1); } /* * Pad to end of word with blanks, this allows hashing and comparison * algorithms to be word oriented rather than byte oriented. * * Compress the record based on input parameters. */ if (cache_ptr -> record_length >= 0) { if (compress_records) { pass1_record_compress (cache_ptr -> recordp, &cache_ptr -> record_length); } /* Pad to the end of the word with blanks */ for (i = 0; i < sizeof (int) - 1; ++i) { cache_ptr->recordp[cache_ptr->record_length + i] = ' '; } } /* * Compute hash code for the record. * * The value is shifted on each iteration to ensure that the * magnitude of the sum is large enough. Otherwise when the sum is * divided by the size of the symbol table, the remainder will be * clustered toward the beginning of the table. */ sum = 0; int_ptr = (int *) cache_ptr -> recordp; end_int_ptr = int_ptr + ((cache_ptr->record_length + sizeof (int) - 1) / sizeof (int)); for (; int_ptr < end_int_ptr; int_ptr++) { /* if (p1_debug) { printf ("sum: %x ", sum); }*/ /* * The next statement does a left rotate of 7. The add operation is * used instead of a logical-or operation since the right shift * is an arithmetic shift which propogates the sign bit. An * or operation into 25 bits of propogated sign bit loses a lot * of significance. */ sum = (sum >> (sizeof (int) * 8 - 7)) + (sum << 7);/* left rotate 7 */ /* if (p1_debug) { printf ("sum<<7: %x ", sum); } */ sum += *int_ptr; /* if (p1_debug) { printf ("sum:%x value:%X\n", sum, *int_ptr); } */ } sum = abs (sum); if (p1_debug) { printf ("sum: %d ", sum); } hash_code = sum % sym_tab_size; quotient = sum / sym_tab_size; if ((quotient % sym_tab_size) == 0) { quotient = 1; } /* * Loop until a unique hash code is found. */ for (probe_count = 0; probe_count < sym_tab_size; ++probe_count, ++hash_collisions) { do { hash_code = (hash_code + quotient) % sym_tab_size; } while (hash_code == 0);/* ensure sym tab loc 0 is never used */ /* * if symbol table entry is used but record is not is cache, * read record into cache by flushing the least recently * entry entry from the cache. */ if (p1_debug) { printf ("hash_code: %d\n", hash_code); } if ((int) sym_tab_cache_ptr[hash_code] == CACHE_NOT_IN_CACHE) { cache_miss++; if (p1_debug) { printf ("cache_miss\n"); } deq_tail_dll (cache_head_ptr, cache_tail_ptr, reread_cache_ptr, cache_next_ptr, cache_prev_ptr); if (reread_cache_ptr -> hash_code != HASH_FREE_ENTRY) { sym_tab_cache_ptr[reread_cache_ptr -> hash_code] = (cache_entry_type *) CACHE_NOT_IN_CACHE; } for (i = 0; i < file_count; ++i) { if (files[i].sym_tab_index[hash_code] != 0) { reread_file_ptr = &files[i]; break; } } if (i == file_count) { error ("Consistency error on rehash read"); } reread_index = abs (reread_file_ptr -> sym_tab_index[hash_code]); reread_into_cache( reread_file_ptr, reread_index, reread_cache_ptr); if (reread_cache_ptr -> record_length >= 0) { if (compress_records) { pass1_record_compress ( reread_cache_ptr -> recordp, &reread_cache_ptr -> record_length); } for (i = 0; i < sizeof (int) - 1; ++i) { reread_cache_ptr-> recordp[reread_cache_ptr->record_length + i] = ' '; } } if (p1_debug) { printf ("%d %s\n", reread_cache_ptr -> record_length, reread_cache_ptr -> recordp); } sym_tab_cache_ptr[hash_code] = reread_cache_ptr; reread_cache_ptr -> hash_code = hash_code; enq_head_dll (cache_head_ptr, cache_tail_ptr, reread_cache_ptr, cache_next_ptr, cache_prev_ptr); } /* * if this is the first time this hash code has been used, * point the symbol table to the cache entry, * insert the cache entry in the front of the cache. * break out of loop. */ if (sym_tab_cache_ptr[hash_code] == CACHE_FREE_ENTRY) { if (p1_debug) { printf ("cache_free_entry\n"); } sym_tab_cache_ptr[hash_code] = cache_ptr; cache_ptr -> hash_code = hash_code; enq_head_dll (cache_head_ptr, cache_tail_ptr, cache_ptr, cache_next_ptr, cache_prev_ptr); cache_ptr = 0; break; /* * if this hash code is used and the record is in cache, * Compare our record with the one in the cache. * if the records are the same, * move cache entry to front of list, * break out of the loop. */ } else { found_cache_ptr = sym_tab_cache_ptr[hash_code]; if (p1_debug) { printf ("len1: %d len2: %d\n", cache_ptr -> record_length, found_cache_ptr -> record_length); } if (cache_ptr->record_length == found_cache_ptr->record_length) { found_int_ptr = (int *) found_cache_ptr->recordp; int_ptr = (int *) cache_ptr -> recordp; end_int_ptr = int_ptr + ((cache_ptr -> record_length + sizeof (int) - 1) / sizeof (int)); for (; int_ptr < end_int_ptr; int_ptr++) { /* printf( "v1: %x v2:%x\n", *int_ptr, *found_int_ptr ) ; */ if (*int_ptr != *found_int_ptr) { break; } found_int_ptr++; } if (int_ptr >= end_int_ptr) { rem_dll (cache_head_ptr, cache_tail_ptr, found_cache_ptr, cache_next_ptr, cache_prev_ptr); enq_head_dll (cache_head_ptr, cache_tail_ptr, found_cache_ptr, cache_next_ptr, cache_prev_ptr); break; } } } } if (probe_count == sym_tab_size) { error ("Symbol table overflow"); } /* * If the record array for the file has overflowed, * fatal error. */ if (index + 1 >= file_ptr -> record_array_alloc) { error ("record array size computed wrong"); } /* * Install record into record array. */ file_ptr -> record[index].rfa = rfa; file_ptr -> record[index].value[0] = -hash_code; file_ptr -> record[index].value[1] = -hash_code; /* * Install line into symbol table. */ if (file_ptr -> sym_tab_index[hash_code] == 0) { file_ptr -> sym_tab_index[hash_code] = index; } else { file_ptr -> sym_tab_index[hash_code] = -index; } return (0); } /* * pass1_record_compress: compress out insignificant bytes in a record. * * This routine handles options which compares only a portion of the * record. For the -b option, all occurrences of multiple blanks are * compressed into a single blank. For the -c option, all columns not * specified are removed. * * Return value: * This procedure has no return value. */ void pass1_record_compress (record_ptr, record_len_ptr) char *record_ptr; /* input/output */ /* Record to be compressed in place. */ int *record_len_ptr; /* input/output */ /* Length of record. */ { int i; /* Misc. variable */ int j; /* Misc. variable */ int k; /* Misc. variable */ bool space; /* TRUE if currently doing a space or tab sequence */ /* * Handle column ranges: */ if (column_count != 0) { k = 0; for (i = 0; i < column_count; ++i) { for (j = first_column[i]; j <= last_column[i] && j < *record_len_ptr; ++j) { record_ptr[k] = record_ptr[j]; ++k; } } *record_len_ptr = k; } /* * Handle blank compression: * * Blank compression converts all groups of blanks and horizontal * tabs to a single blank. */ if (blank_compress) { k = 0; space = FALSE; for (i = 0; i < *record_len_ptr; ++i) { if (record_ptr[i] == ' ' || record_ptr[i] == '\t') { space = TRUE; } else { if (space) { record_ptr[k++] = ' '; space = FALSE; } record_ptr[k++] = record_ptr[i]; } } if (space) { record_ptr[k++] = ' '; } *record_len_ptr = k; } /* * Handle blank removal: * * Blank removal removes all blanks and horizontal tabs. */ if (blank_remove) { k = 0; space = FALSE; for (i = 0; i < *record_len_ptr; ++i) { if (record_ptr[i] == ' ' || record_ptr[i] == '\t') { space = TRUE; } else { if (space) { space = FALSE; } record_ptr[k++] = record_ptr[i]; } } *record_len_ptr = k; } }